{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Document360"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from llama_index.readers.document360 import Document360Reader\n",
    "\n",
    "# Prefer the DOCUMENT360_API_KEY environment variable so that a real key is\n",
    "# never saved inside the notebook; the placeholder below is only used when\n",
    "# the variable is not set.\n",
    "api_key = os.environ.get(\"DOCUMENT360_API_KEY\", \"document360_api_key\")\n",
    "\n",
    "reader = Document360Reader(api_key=api_key)\n",
    "\n",
    "documents = reader.load_data()\n",
    "\n",
    "# Print the text of every loaded document.\n",
    "for d in documents:\n",
    "    print(d.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Customize Document360Reader Example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter entities to process\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "from llama_index.readers.document360.entities import (\n",
    "    Article,\n",
    "    ArticleSlim,\n",
    "    Category,\n",
    "    ProjectVersion,\n",
    ")\n",
    "\n",
    "from llama_index.readers.document360 import Document360Reader\n",
    "\n",
    "\n",
    "def should_process_project_version(project_version: ProjectVersion):\n",
    "    \"\"\"Return True only for project versions whose id is listed below.\"\"\"\n",
    "    project_versions_of_interest = [\"document360_project_version_id\"]\n",
    "\n",
    "    return project_version.get_id() in project_versions_of_interest\n",
    "\n",
    "\n",
    "def should_process_category(\n",
    "    category: Category, parent_categories: list[Category]\n",
    "):\n",
    "    \"\"\"Return True only for categories whose id is listed below.\n",
    "\n",
    "    The parent categories of the current category are passed in as\n",
    "    ``parent_categories``; this example does not use them.\n",
    "    \"\"\"\n",
    "    categories_of_interest = [\"document360_category_id\"]\n",
    "\n",
    "    return category.get_id() in categories_of_interest\n",
    "\n",
    "\n",
    "def should_process_article(article: ArticleSlim):\n",
    "    \"\"\"Return False for the one article title that should be skipped.\"\"\"\n",
    "    # Python's inequality operator is `!=`; `!==` is a syntax error.\n",
    "    return article.get_title() != \"Do not process this article\"\n",
    "\n",
    "\n",
    "# Initialize the Document360Reader with the three filter callbacks\n",
    "reader = Document360Reader(\n",
    "    api_key=api_key,\n",
    "    should_process_project_version=should_process_project_version,\n",
    "    should_process_category=should_process_category,\n",
    "    should_process_article=should_process_article,\n",
    ")\n",
    "\n",
    "reader.load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Customizing Error Handling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "from typing import Union\n",
    "\n",
    "from llama_index.readers.document360.entities import (\n",
    "    Article,\n",
    "    ArticleSlim,\n",
    ")\n",
    "\n",
    "from llama_index.readers.document360 import Document360Reader\n",
    "\n",
    "\n",
    "def handle_rate_limit_error():\n",
    "    \"\"\"Log an error message for a rate-limit event.\"\"\"\n",
    "    logging.error(\"Rate limit exceeded. Retrying...\")\n",
    "\n",
    "\n",
    "def handle_request_http_error(e: Exception):\n",
    "    \"\"\"Log the exception passed in for a failed HTTP request.\"\"\"\n",
    "    logging.error(f\"HTTP Request failed. {e}\")\n",
    "\n",
    "\n",
    "def handle_article_processing_error(\n",
    "    e: Exception, article: Union[Article, ArticleSlim]\n",
    "):\n",
    "    \"\"\"Log the article that failed together with the exception.\"\"\"\n",
    "    logging.error(f\"Failed to process {article}: {e}\")\n",
    "\n",
    "\n",
    "def handle_load_data_error(e: Exception):\n",
    "    \"\"\"Log the exception passed in for a load-data failure.\"\"\"\n",
    "    logging.error(f\"Load data error: {e}\")\n",
    "\n",
    "\n",
    "# Initialize the Document360Reader with the four error callbacks\n",
    "reader = Document360Reader(\n",
    "    api_key=api_key,\n",
    "    handle_rate_limit_error=handle_rate_limit_error,\n",
    "    handle_request_http_error=handle_request_http_error,\n",
    "    handle_article_processing_error=handle_article_processing_error,\n",
    "    handle_load_data_error=handle_load_data_error,\n",
    ")\n",
    "\n",
    "reader.load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hook into the Document360Reader Lifecycle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "from llama_index.readers.document360.entities import Article, Category\n",
    "\n",
    "from llama_index.readers.document360 import Document360Reader\n",
    "\n",
    "\n",
    "def handle_batch_finished():\n",
    "    \"\"\"Emit an info-level log line saying a batch is done.\"\"\"\n",
    "    message = \"Batch finished processing\"\n",
    "    logging.info(message)\n",
    "\n",
    "\n",
    "def handle_category_processing_started(category: Category):\n",
    "    \"\"\"Emit an info-level log line naming the category passed in.\"\"\"\n",
    "    message = f\"Started processing category: {category}\"\n",
    "    logging.info(message)\n",
    "\n",
    "\n",
    "def handle_article_processing_started(article: Article):\n",
    "    \"\"\"Emit an info-level log line naming the article passed in.\"\"\"\n",
    "    message = f\"Processing article: {article}\"\n",
    "    logging.info(message)\n",
    "\n",
    "\n",
    "# Wire the three lifecycle callbacks into a new Document360Reader\n",
    "reader = Document360Reader(\n",
    "    api_key=api_key,\n",
    "    handle_batch_finished=handle_batch_finished,\n",
    "    handle_category_processing_started=handle_category_processing_started,\n",
    "    handle_article_processing_started=handle_article_processing_started,\n",
    ")\n",
    "\n",
    "reader.load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a custom llama_index Document from Document360 Article"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "from llama_index.core.schema import Document\n",
    "from llama_index.readers.document360.entities import Article\n",
    "from llama_index.readers.document360 import Document360Reader\n",
    "\n",
    "# Your own code that handles how to process your Document360 article.\n",
    "# `strip_html` is not provided by this example: supply your own helper\n",
    "# (the name suggests it turns the article's HTML into plain text).\n",
    "from your_module import ProcessedArticle, LinkExtractor, strip_html\n",
    "\n",
    "\n",
    "def article_to_custom_document(article: Article):\n",
    "    \"\"\"Build a llama_index Document from a Document360 article.\n",
    "\n",
    "    The article's HTML content is passed through ``strip_html`` to produce\n",
    "    the document text, and the links collected by ``ProcessedArticle`` are\n",
    "    stored under the ``links`` key of ``extra_info``.\n",
    "    \"\"\"\n",
    "    processed_article = ProcessedArticle(article=article)\n",
    "\n",
    "    # for example, you might want to extract links on the article page\n",
    "    processed_article.extract_links(LinkExtractor())\n",
    "    links = processed_article.get_links()\n",
    "\n",
    "    return Document(\n",
    "        doc_id=article.get_id(),\n",
    "        text=strip_html(article.get_html_content()),\n",
    "        extra_info={\n",
    "            \"title\": article.get_title(),\n",
    "            \"category_id\": article.get_category_id(),\n",
    "            \"project_version_id\": article.get_project_version_id(),\n",
    "            \"created_by\": article.get_created_by(),\n",
    "            \"created_at\": article.get_created_at(),\n",
    "            \"modified_at\": article.get_modified_at(),\n",
    "            \"url\": article.get_url(),\n",
    "            \"links\": links,\n",
    "        },\n",
    "    )\n",
    "\n",
    "\n",
    "# Initialize the Document360Reader with the custom conversion callback\n",
    "reader = Document360Reader(\n",
    "    api_key=api_key,\n",
    "    article_to_custom_document=article_to_custom_document,\n",
    ")\n",
    "\n",
    "reader.load_data()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
